# Bootstrap the package manager. requireNamespace() checks availability without
# attaching the package or emitting a load message; require() is discouraged
# for dependency checks because it attaches and only warns on failure.
if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman")
# Load (installing if missing) all CRAN/Bioconductor dependencies in one call
pacman::p_load("edgeR", "readr", "readxl", "biomaRt", "magrittr", "tibble", "stringr",
"ggplot2", "data.table", "patchwork", "openxlsx", "dplyr", "missForest",
"RColorBrewer", "limma", "DEqMS", "preprocessCore", "DEP",
"SummarizedExperiment", "Metrics", "fdrtool", "aamisc", "sva")
# install aamisc package for MDS and Volcano plots
# Commented out for future use (one-time installation of archived packages):
# pacman::p_load("qvalue", "rain", "limma", "devtools")
# url <- "https://cran.r-project.org/src/contrib/Archive/HarmonicRegression/HarmonicRegression_1.0.tar.gz"
# pkgFile <- "HarmonicRegression_1.0.tar.gz"
# download.file(url = url, destfile = pkgFile)
# install.packages(pkgs=pkgFile, type="source", repos=NULL)
# file.remove(pkgFile)
# pacman::p_load_gh("altintasali/aamisc")
# Publication colour palette, keyed by experimental group
publication_colors <- setNames(
  c("#285291", "#7C1516", "#9B9B9B"),
  c("Control", "Metformin", "Sham")
)
# Input locations: protein count matrix, sample metadata, and gene annotation
count_file <- "../../../data/count/report.unique_genes_matrix.tsv"
meta_file <- "../../../data/metadata/meta_proteomics.xlsx"
geneinfo_file <- "../../../data/gene_annotation/horse_gene_annotation.tsv.gz"
# 1. Read the proteomics intensity matrix (tab-delimited: one "Genes" ID
# column plus one numeric intensity column per mass-spectrometry run)
count <- readr::read_delim(count_file)
## Column specification echoed from a previous knitr run (kept for reference):
## Rows: 2967 Columns: 74
## -- Column specification --------------------------------------------------------
## Delimiter: "\t"
## chr (1): Genes
## dbl (73): D:\Mass_spectrometry\Raw_data\Joakim\Simon Horse second run2\1A_RA...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# 2. Read the sample metadata sheet (Excel)
meta <- readxl::read_excel(meta_file)
# 3. Clean count data
# Drop a blank column produced during MS processing, matched by its full
# Windows path (backslashes are doubly escaped: once for R, once for the regex)
count <- count %>% select(-matches("D:\\\\Mass_spectrometry\\\\Raw_data\\\\Joakim\\\\Simon Horse second run2\\\\10A_RA11_1_24830.d"))
# 4. Load gene annotation (gzipped TSV) with data.table
geneinfo <- fread(geneinfo_file) # Load gene annotation
# Rename to stable column names; this order must match the file's column order
setnames(geneinfo, old = names(geneinfo), new = c("ENSEMBL", "ENSEMBLv", "Description_detailed", "Chr", "Start", "End", "Strand", "GENENAME", "ENTREZID", "Description"))
geneinfo <- geneinfo %>%
select(-ENSEMBLv, -Description_detailed) %>% # Drop columns not used downstream
distinct(ENSEMBL, .keep_all = TRUE) # Keep one row per ENSEMBL ID
# 5. Attach annotation to every gene in the count data (left join on symbol).
# NOTE(review): merge() sorts by the key, so `annot` rows are NOT in `count`
# row order; downstream code must re-align with match() before use (it does, at
# the DGEList construction step).
annot <- merge(count[, "Genes", drop = FALSE], geneinfo, by.x = "Genes", by.y = "GENENAME", all.x = TRUE)
# 6. Move the "Genes" column into row names so the matrix can be subset
# positionally by gene
count <- count %>%
remove_rownames() %>%
column_to_rownames(var = "Genes") # Set "Genes" column as row names for easier subsetting
# Strip the raw-file path prefix AND everything from the first underscore on,
# leaving the short sample code (e.g. "1B"); the two patterns are alternated
# with "|" in a single gsub()
cleaned_names <- gsub("D:\\\\Mass_spectrometry\\\\Raw_data\\\\Joakim\\\\Simon Horse second run2\\\\|_.*", "", colnames(count))
colnames(count) <- cleaned_names
# NOTE(review): overwriting `Sample ID` like this assumes metadata rows are in
# exactly the same order as the count-matrix columns -- verify against the
# metadata file, as a mismatch would silently mislabel every sample
meta$`Sample ID` <- cleaned_names
# 7. Drop low-quality samples from both the count matrix and the metadata
samples_to_remove <- c("25C", "12A", "22A", "19A") # Specify samples to remove
count <- count[, setdiff(colnames(count), samples_to_remove)]
meta <- meta %>% filter(!(`Sample ID` %in% samples_to_remove))
# 8. Keep only the terminal (4-month) samples in both objects
valid_conditions <- c("LA_Metformin_4months", "LA_Control_4months",
                      "RA_Metformin_4months", "RA_Control_4months",
                      "LA_Sham_4months", "RA_Sham_4months")
meta <- meta %>% filter(Condition %in% valid_conditions)
count <- count[, intersect(colnames(count), meta$`Sample ID`)]
# 9. Count non-missing proteins per sample, ordered to match the metadata rows
num_proteins <- colSums(!is.na(count))[meta$`Sample ID`]
# Colour palette for groups (publication scheme); one colour per group level,
# then expanded to a per-sample vector for base barplot()
KUalt <- c("#7C1516", "#285291", "#434343", "#999999")
group_colors <- setNames(KUalt[1:length(unique(meta$Group))], unique(meta$Group))
bar_colors <- group_colors[meta$Group]
# Barplot: proteins detected per sample, before filtering
par(mfrow = c(1, 1)) # Reset plot layout
barplot(num_proteins,
main = "Number of Proteins in Each Sample\nBefore Filtering",
xlab = "Sample",
ylab = "Number of Proteins",
col = bar_colors,
border = "black",
ylim = c(0, 2500),
las = 2) # Rotate x-axis labels
# NOTE(review): "total" here is the highest per-sample protein count (max),
# not the number of rows in the matrix -- confirm this is the intended summary
cat("Total number of proteins before filtering:", max(num_proteins), "\n")
## Total number of proteins before filtering: 2610
# 10. Flag genes with fewer than three valid (non-NA) values in any condition
too_few_per_condition <- lapply(unique(meta$Condition), function(cond) {
  cond_samples <- meta$`Sample ID`[meta$Condition == cond]
  cond_matrix <- count[, cond_samples, drop = FALSE]
  rownames(cond_matrix)[rowSums(!is.na(cond_matrix)) < 3]
})
invalid_genes <- unique(unlist(too_few_per_condition))
# Drop the flagged genes from the count data
filtered_count <- count[!rownames(count) %in% invalid_genes, ]
# 11. Non-missing proteins per sample after filtering
num_proteins_after <- colSums(!is.na(filtered_count))
num_proteins_after <- num_proteins_after[meta$`Sample ID`] # Match metadata row order
# Side-by-side barplots comparing protein counts before vs after filtering
par(mfrow = c(1, 2)) # Set plot layout to 1 row and 2 columns
# Plot 1: proteins per sample before filtering
barplot(num_proteins,
main = "Number of Proteins in Each Sample\nBefore Filtering",
xlab = "Sample",
ylab = "Number of Proteins",
col = bar_colors,
border = "black",
ylim = c(0, 2500),
las = 2)
# Plot 2: proteins per sample after filtering
barplot(num_proteins_after,
main = "Number of Proteins in Each Sample\nAfter Filtering",
xlab = "Sample",
ylab = "Number of Proteins",
col = bar_colors,
border = "black",
ylim = c(0, 2500),
las = 2)
# Summary statistics ("total" = max per-sample count, see note above the
# first barplot)
cat("Total number of proteins before filtering:", max(num_proteins), "\n")
## Total number of proteins before filtering: 2610
cat("Average number of proteins before filtering:", mean(num_proteins), "\n")
## Average number of proteins before filtering: 2435.851
cat("Total number of proteins after filtering:", max(num_proteins_after), "\n")
## Total number of proteins after filtering: 2078
cat("Average number of proteins after filtering:", mean(num_proteins_after), "\n")
## Average number of proteins after filtering: 2005.426
# Keep the cleaned-but-unfiltered matrix under a descriptive name; this (not
# `filtered_count`) is what feeds the DEP workflow below
unfiltered_count <- count
# Step 1: Build the annotation columns the DEP package expects (MaxQuant-style
# column names). Real annotation data is unavailable here, so NA placeholders
# are used, with the gene row names seeded into the ID-like columns.
proxy_columns <- c("Protein.IDs", "Majority.protein.IDs", "Protein.names", "Gene.names",
"Fasta.headers", "Peptides", "Razor...unique.peptides", "Unique.peptides",
"Only.identified.by.site", "Reverse", "Potential.contaminant")
# NA-filled frame with one row per protein in `unfiltered_count`
proxy_df <- data.frame(matrix(NA, nrow = nrow(unfiltered_count), ncol = length(proxy_columns)))
colnames(proxy_df) <- proxy_columns
# Seed the first four (identifier) columns with the gene row names so DEP has
# usable IDs for make_unique() later
proxy_df[, 1:4] <- lapply(1:4, function(i) rownames(unfiltered_count))
# Step 2: Merge proxies with the intensities and rename intensity columns to
# DEP's "LFQ.intensity.<sample>" convention.
merged_df <- cbind(proxy_df, unfiltered_count)
# Index past the proxy columns instead of hard-coding 12, so this survives
# any change to `proxy_columns`
lfq_start <- length(proxy_columns) + 1
colnames(merged_df)[lfq_start:ncol(merged_df)] <- paste0("LFQ.intensity.", colnames(unfiltered_count))
# Step 3: Reorder so LFQ intensities sit between the ID columns and the
# filter-flag columns (anchored, escaped pattern to match only true LFQ cols)
lfq_columns <- grep("^LFQ\\.intensity\\.", colnames(merged_df), value = TRUE)
rearranged_columns <- c(proxy_columns[1:8], lfq_columns, proxy_columns[9:11])
data <- merged_df[, rearranged_columns]
# Duplicated gene names would break SummarizedExperiment construction; signal
# a real warning (not a message) so it is caught by warning handlers
if (any(duplicated(data$Gene.names))) {
warning("There are duplicated gene names in the dataset.", call. = FALSE)
}
# Step 4: make_unique() builds unique row identifiers for DEP, using
# "Gene.names" as the primary name and "Protein.IDs" as the fallback
data_unique <- make_unique(data, "Gene.names", "Protein.IDs", delim = ";")
# Step 5: Locate the LFQ intensity columns with an anchored, escaped pattern.
# The original pattern "LFQ." was unanchored and its "." matched any character,
# so it could pick up unrelated columns containing "LFQ".
LFQ_columns <- grep("^LFQ\\.intensity\\.", colnames(data_unique)) # Column numbers of LFQ intensities
# Build the experimental design from the metadata (`meta`)
experimental_design <- data.frame(
label = colnames(unfiltered_count), # Column names from unfiltered data as labels
condition = meta$Condition, # Condition information from metadata
replicate = NA # Placeholder, filled in below
)
# Number replicates within each condition group
experimental_design <- experimental_design %>%
group_by(condition) %>%
mutate(replicate = row_number()) %>% # Assign replicate numbers within each condition
ungroup()
# group_by()/ungroup() return a tibble, and setting row names on a tibble is
# deprecated (this previously emitted a deprecation warning) -- coerce back to
# a plain data.frame before assigning row names
experimental_design <- as.data.frame(experimental_design)
row.names(experimental_design) <- experimental_design$label
# Print experimental design for verification
print(experimental_design)
## Output from a previous run (printed while the object was still a tibble):
## # A tibble: 47 x 3
## label condition replicate
## * <chr> <chr> <int>
## 1 1B RA_Metformin_4months 1
## 2 1C LA_Metformin_4months 1
## 3 2B RA_Control_4months 1
## 4 2C LA_Control_4months 1
## 5 3B RA_Metformin_4months 2
## 6 3C LA_Metformin_4months 2
## 7 4B RA_Sham_4months 1
## 8 4C LA_Sham_4months 1
## 9 5B RA_Metformin_4months 3
## 10 5C LA_Metformin_4months 3
## # i 37 more rows
# Step 6: Build the SummarizedExperiment with DEP::make_se(), which needs the
# unique row identifiers and the experimental design created above
data_se <- make_se(data_unique, LFQ_columns, experimental_design)
# QC: frequency of identified proteins across samples
plot_frequency(data_se)
# Note: no additional filtering is performed here, since filtering was already
# done manually before creating the SE object
# Proteins identified per sample (sample-quality overview)
plot_numbers(data_se)
# Overlap of protein identifications across samples (batch-effect check)
plot_coverage(data_se)
# Filter proteins based on missing values across replicates.
# thr = 0 keeps only proteins identified in ALL replicates of at least one
# condition, giving a consistently detected set for downstream analysis.
data_filt <- filter_missval(data_se, thr = 0)
# Re-check identification frequency after filtering
plot_frequency(data_filt)
# Re-check proteins identified per sample after filtering
plot_numbers(data_filt)
# Re-check identification coverage between samples after filtering
plot_coverage(data_filt)
# Number of proteins (rows) before filtering
num_proteins_before <- nrow(assay(data_se))
cat("Number of proteins before filtering:", num_proteins_before, "\n")
## Number of proteins before filtering: 2967
# Number of proteins (rows) after filtering
# Note: this reuses/overwrites the `num_proteins_after` per-sample vector from
# the earlier manual-filtering section
num_proteins_after <- nrow(assay(data_filt))
cat("Number of proteins after filtering:", num_proteins_after, "\n")
## Number of proteins after filtering: 2372
# Per-sample count of identified (non-NA) proteins before DEP filtering
num_proteins_before <- colSums(!is.na(assay(data_se))) # data_se holds the unfiltered SE
# Data frame for plotting the pre-filtering counts
protein_data_before <- data.frame(
Sample = names(num_proteins_before),
Number_of_Proteins = num_proteins_before,
Group = meta$Group # Group labels from the metadata (assumes matching sample order)
)
# Per-sample count of identified (non-NA) proteins AFTER DEP filtering
num_proteins_after <- colSums(!is.na(assay(data_filt)))
# Data frame for plotting the post-filtering counts
protein_data_after <- data.frame(
Sample = names(num_proteins_after),
Number_of_Proteins = num_proteins_after,
Group = meta$Group # Group labels from the metadata (assumes matching sample order)
)
# Barplot of proteins per sample, before filtering.
# scale_fill_manual() maps fill LEVELS (Group) to colours, so it must be given
# the per-level palette `group_colors`; the per-sample vector `bar_colors`
# carries duplicated names and only worked by accidental name matching.
ggplot(protein_data_before, aes(x = Sample, y = Number_of_Proteins, fill = Group)) +
geom_bar(stat = "identity", color = "black") +
scale_fill_manual(values = group_colors) +
labs(
title = "Number of Proteins in Each Sample\nBefore Filtering",
x = "Sample",
y = "Number of Proteins"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5)
) +
ylim(0, 3000) # Fix y-axis limit for comparability
# Barplot of proteins per sample, after filtering (same palette fix)
ggplot(protein_data_after, aes(x = Sample, y = Number_of_Proteins, fill = Group)) +
geom_bar(stat = "identity", color = "black") +
scale_fill_manual(values = group_colors) +
labs(
title = "Number of Proteins in Each Sample\nAfter Filtering",
x = "Sample",
y = "Number of Proteins"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5)
) +
ylim(0, 3000) # Fix y-axis limit for comparability
## Normalization
# Inspect the un-normalized intensity distributions first
plot_normalization(data_filt)
# Apply variance-stabilizing normalization (VSN).
# VSN is commonly used for proteomics data to handle its high dynamic range
# and heteroscedasticity.
data_filt_norm <- normalize_vsn(data_filt)
# Assess whether the variance has been stabilized: a successful normalization
# shows an approximately constant mean-variance relationship.
meanSdPlot(data_filt_norm, rank = TRUE)
plot_normalization(data_filt_norm)
# Verdict: the mean-SD plot shows that after variance stabilization the median
# (a reasonable estimator of the feature-level standard deviation conditional
# on the mean) is approximately a horizontal line, i.e. the variance has been
# stabilized and the data is suitable for downstream differential analysis.
# Visualize the pattern of missing values per protein/sample
plot_missval(data_filt_norm)
# Intensity distributions and cumulative fraction of proteins with/without
# missing values
plot_detect(data_filt_norm)
# Missing values in proteomics data are typically "Missing Not At Random"
# (MNAR): proteins with missing values tend to have low intensities near the
# detection limit. Appropriate imputation methods for MNAR data are
# left-censored techniques such as:
# - Quantile Regression-based Left-Censored imputation (QRILC)
# - Random draws from a left-shifted distribution ("MinProb" or "man")
# MinProb: random draws from a Gaussian centred around a minimal value.
# Note: imputation is stochastic, so exact values differ between runs unless
# a seed is set.
data_imp_MinProb <- impute(data_filt_norm, fun = "MinProb", q = 0.01)
## Imputing along margin 2 (samples/columns).
## [1] 0.489567
# "man": random draws from a manually specified left-shifted Gaussian
data_imp_man <- impute( data_filt_norm, fun = "man", shift = 1.8, scale = 0.3)
# QRILC: useful when missing values are assumed to lie below the detection limit
data_imp_qrilc <- impute( data_filt_norm, fun = "QRILC")
## Imputing along margin 2 (samples/columns).
# Compare intensity distributions before and after each imputation method
plot_imputation( data_filt_norm, data_imp_MinProb, data_imp_man, data_imp_qrilc)
### Verdict: after visual inspection, MinProb handles the MNAR nature of the
### data best, and will be used downstream.
# Log2-transform the intensities (+1 to avoid log(0)) and drop rows containing
# any NA, since the functions below need a complete matrix
logCounts <- log2(count + 1) # Add 1 to avoid log(0) issues
logCounts_noNA <- na.omit(logCounts)
# Build the edgeR DGEList; re-align annotation rows to the filtered matrix
# (annot was sorted by merge() earlier, so match() is required here)
annot_reordered <- annot[match(rownames(logCounts_noNA), annot$Genes), ]
d <- DGEList(counts = logCounts_noNA, genes = annot_reordered, samples = meta)
# Cell-means design: one column per condition, no intercept
design <- model.matrix(~0 + Condition, data = d$samples)
# 1. MDS plot of the raw (log2) data
mds_raw <- plotMDS(logCounts_noNA, plot = FALSE)
mds_plot_raw <- aamisc::ggMDS(mds = mds_raw, meta = d$samples, color.by = "Group", shape.by = "Region", legend.position = "right", text.by = "Horse", text.size = 1.5) +
scale_color_manual(values = publication_colors) +
labs(title = "MDS: Raw Data", x = "MDS Dimension 1", y = "MDS Dimension 2") +
theme(legend.title = element_text(face = "bold"), plot.title = element_text(face = "bold", hjust = 0.5))
# 3. MDS plot of batch-corrected data (ComBat, preserving the Condition effect
# via the model matrix passed as `mod`)
combat_data <- ComBat(dat = logCounts_noNA, batch = as.factor(d$samples$Batch), mod = model.matrix(~Condition, data = d$samples))
## Found 4 batches
## Adjusting for 5 covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
mds_combat <- plotMDS(combat_data, plot = FALSE)
mds_plot_combat <- aamisc::ggMDS(mds = mds_combat, meta = d$samples, color.by = "Group", shape.by = "Region", legend.position = "right", text.by = "Horse", text.size = 1.5) +
scale_color_manual(values = publication_colors) +
labs(title = "MDS: ComBat Adjusted", x = "MDS Dimension 1", y = "MDS Dimension 2") +
theme(legend.title = element_text(face = "bold"), plot.title = element_text(face = "bold", hjust = 0.5))
# 2. MDS plot with the Horse (animal) effect removed for visualization only;
# removeBatchEffect output should not be fed into the differential tests
logCounts_blocked <- removeBatchEffect(logCounts_noNA, batch = as.factor(d$samples$Horse), design = design)
## Coefficients not estimable: batch22 batch23
## Warning: Partial NA coefficients for 1350 probe(s)
mds_blocked <- plotMDS(logCounts_blocked, plot = FALSE)
mds_plot_blocked <- aamisc::ggMDS(mds = mds_blocked, meta = d$samples, color.by = "Group", shape.by = "Region", legend.position = "right", text.by = "Horse", text.size = 1.5) +
scale_color_manual(values = publication_colors) +
labs(title = "MDS: Blocked for Horse", x = "MDS Dimension 1", y = "MDS Dimension 2") +
theme(legend.title = element_text(face = "bold"), plot.title = element_text(face = "bold", hjust = 0.5))
# 4. MDS plot after surrogate variable analysis (SVA)
mod <- model.matrix(~0 + Condition, data = d$samples) # full model (conditions)
mod0 <- model.matrix(~1, data = d$samples) # null model (intercept only)
n.sv <- num.sv(logCounts_noNA, mod, method = "be") # estimate number of SVs
# sva() requires a numeric matrix input
logCounts_noNA <- as.matrix(logCounts_noNA)
storage.mode(logCounts_noNA) <- "numeric"
svobj <- sva(logCounts_noNA, mod, mod0, n.sv = n.sv)
## Number of significant surrogate variables is: 8
## Iteration (out of 5 ):1 2 3 4 5
# Augment the design with the estimated surrogate variables
modSv <- cbind(mod, svobj$sv)
# NOTE(review): despite the "SVA adjusted" label, this call removes the Horse
# effect (as batch) while protecting the SV-augmented design; it does not
# regress out the surrogate variables themselves -- confirm this is intended
logCounts_sva <- removeBatchEffect(logCounts_noNA, batch = d$samples$Horse, design = modSv)
## Coefficients not estimable: batch22 batch23
## Warning: Partial NA coefficients for 1350 probe(s)
mds_sva <- plotMDS(logCounts_sva, plot = FALSE)
mds_plot_sva <- aamisc::ggMDS(mds = mds_sva, meta = d$samples, color.by = "Group", shape.by = "Region", legend.position = "right") +
scale_color_manual(values = publication_colors) +
labs(title = "MDS: SVA Adjusted", x = "MDS Dimension 1", y = "MDS Dimension 2") +
theme(legend.title = element_text(face = "bold"), plot.title = element_text(face = "bold", hjust = 0.5))
# 5. Same SVA-adjusted MDS, with per-horse text labels
mds_plot_sva_labelled <- aamisc::ggMDS(mds = mds_sva, meta = d$samples, color.by = "Group", shape.by = "Region", legend.position = "right", text.by = "Horse", text.size = 1.5) +
scale_color_manual(values = publication_colors) +
labs(title = "MDS: SVA Adjusted (Labelled)", x = "MDS Dimension 1", y = "MDS Dimension 2") +
theme(legend.title = element_text(face = "bold"), plot.title = element_text(face = "bold", hjust = 0.5))
# Display all MDS variants
print(mds_plot_raw)
print(mds_plot_combat)
print(mds_plot_blocked)
print(mds_plot_sva)
print(mds_plot_sva_labelled)
# Note: this DEP analysis is an extra quality-control step; the main
# differential abundance analysis uses limma, which better handles the more
# complex study design.
# Define contrasts for the differential analysis: each compares conditions
# (e.g. Metformin vs Control) within one atrial region (RA/LA)
contrasts <- c("RA_Control_4months_vs_RA_Sham_4months",
"LA_Control_4months_vs_LA_Sham_4months",
"RA_Metformin_4months_vs_RA_Control_4months",
"LA_Metformin_4months_vs_LA_Control_4months")
# Run differential abundance tests on the MinProb-imputed data.
# Since imputation can affect the statistics, results should be cross-checked
# against the non-imputed data where feasible.
data_diff <- test_diff(data_imp_MinProb, type = "manual", test = contrasts)
## Tested contrasts: RA_Control_4months_vs_RA_Sham_4months, LA_Control_4months_vs_LA_Sham_4months, RA_Metformin_4months_vs_RA_Control_4months, LA_Metformin_4months_vs_LA_Control_4months
# Mark proteins significant at FDR (alpha) < 0.05 with no fold-change cutoff.
# NOTE(review): log2(0) evaluates to -Inf, so the lfc threshold is effectively
# disabled (any fold change passes); log2(1) = 0 was probably intended and
# would behave the same in practice -- confirm the intent
dep <- add_rejections(data_diff, alpha = 0.05, lfc = log2(0))
# Volcano plots per contrast: fold change vs significance, with proteins that
# show large changes at low p-values highlighted
plot_volcano(dep, contrast = "RA_Control_4months_vs_RA_Sham_4months", label_size = 2, add_names = TRUE)
plot_volcano(dep, contrast = "LA_Control_4months_vs_LA_Sham_4months", label_size = 2, add_names = TRUE)
plot_volcano(dep, contrast = "RA_Metformin_4months_vs_RA_Control_4months", label_size = 2, add_names = TRUE)
plot_volcano(dep, contrast = "LA_Metformin_4months_vs_LA_Control_4months", label_size = 2, add_names = TRUE)
# Summary of the number of significant proteins per condition
plot_cond(dep)
# Extract the DEP results into a data frame for further analysis
data_results <- get_results(dep)
# Display the first few rows of the results table (optional)
head(data_results)
## name ID LA_Control_4months_vs_LA_Sham_4months_p.val
## 1 APPL1 APPL1 1.396486e-03
## 2 CDK5 CDK5 7.790937e-02
## 3 LUC7L2 LUC7L2 1.511547e-06
## 4 A1BG A1BG 3.555154e-01
## 5 A2M A2M 7.588705e-01
## 6 AACS AACS 4.963733e-01
## LA_Metformin_4months_vs_LA_Control_4months_p.val
## 1 4.152448e-05
## 2 2.608382e-04
## 3 1.084665e-06
## 4 8.290347e-01
## 5 7.191834e-01
## 6 3.417125e-01
## RA_Control_4months_vs_RA_Sham_4months_p.val
## 1 0.5920144
## 2 0.3610108
## 3 0.7467261
## 4 0.6960744
## 5 0.4805678
## 6 0.3874781
## RA_Metformin_4months_vs_RA_Control_4months_p.val
## 1 0.2993180
## 2 0.2104453
## 3 0.4676201
## 4 0.8899843
## 5 0.4020839
## 6 0.1884397
## LA_Control_4months_vs_LA_Sham_4months_p.adj
## 1 0.4830
## 2 0.8640
## 3 0.0304
## 4 0.9500
## 5 0.9700
## 6 0.9600
## LA_Metformin_4months_vs_LA_Control_4months_p.adj
## 1 2.52e-03
## 2 2.77e-02
## 3 9.54e-06
## 4 9.74e-01
## 5 9.70e-01
## 6 9.36e-01
## RA_Control_4months_vs_RA_Sham_4months_p.adj
## 1 0.998
## 2 0.998
## 3 0.999
## 4 0.999
## 5 0.998
## 6 0.998
## RA_Metformin_4months_vs_RA_Control_4months_p.adj
## 1 0.986
## 2 0.978
## 3 0.992
## 4 0.996
## 5 0.990
## 6 0.974
## LA_Control_4months_vs_LA_Sham_4months_significant
## 1 FALSE
## 2 FALSE
## 3 TRUE
## 4 FALSE
## 5 FALSE
## 6 FALSE
## LA_Metformin_4months_vs_LA_Control_4months_significant
## 1 TRUE
## 2 TRUE
## 3 TRUE
## 4 FALSE
## 5 FALSE
## 6 FALSE
## RA_Control_4months_vs_RA_Sham_4months_significant
## 1 FALSE
## 2 FALSE
## 3 FALSE
## 4 FALSE
## 5 FALSE
## 6 FALSE
## RA_Metformin_4months_vs_RA_Control_4months_significant significant
## 1 FALSE TRUE
## 2 FALSE TRUE
## 3 FALSE TRUE
## 4 FALSE FALSE
## 5 FALSE FALSE
## 6 FALSE FALSE
## LA_Control_4months_vs_LA_Sham_4months_ratio
## 1 -1.480
## 2 -0.703
## 3 -1.910
## 4 -0.278
## 5 0.107
## 6 -0.194
## LA_Metformin_4months_vs_LA_Control_4months_ratio
## 1 1.3400
## 2 1.0500
## 3 1.3200
## 4 0.0439
## 5 0.0850
## 6 0.1850
## RA_Control_4months_vs_RA_Sham_4months_ratio
## 1 -0.210
## 2 -0.323
## 3 0.101
## 4 -0.105
## 5 0.221
## 6 -0.222
## RA_Metformin_4months_vs_RA_Control_4months_ratio LA_Control_4months_centered
## 1 -0.3100 -1.2200
## 2 -0.3360 -1.1200
## 3 -0.1710 -1.2300
## 4 -0.0281 -0.0509
## 5 -0.1990 -0.0162
## 6 0.2560 -0.1320
## LA_Metformin_4months_centered LA_Sham_4months_centered
## 1 0.12300 0.2610
## 2 -0.07400 -0.4210
## 3 0.09220 0.6810
## 4 -0.00701 0.2270
## 5 0.06880 -0.1230
## 6 0.05300 0.0623
## RA_Control_4months_centered RA_Metformin_4months_centered
## 1 0.518 0.2080
## 2 0.638 0.3020
## 3 0.476 0.3050
## 4 -0.010 -0.0381
## 5 0.113 -0.0856
## 6 -0.119 0.1380
## RA_Sham_4months_centered
## 1 0.7280
## 2 0.9620
## 3 0.3760
## 4 0.0951
## 5 -0.1080
## 6 0.1030
# Count proteins flagged in the combined "significant" column (TRUE when any
# contrast passes the alpha/lfc cutoffs); na.rm guards against NA flags
num_significant_proteins <- sum(data_results$significant, na.rm = TRUE)
cat("Number of significant proteins detected across all contrasts:", num_significant_proteins, "\n")
## Number of significant proteins detected across all contrasts: 3
# Prepare data for histogram visualization of four datasets:
# 1. `normalized_unimputed`: normalized but not imputed data
# 2. `imputed_data_MinProb`: data imputed with the "MinProb" method
# 3. `imputed_data_qrilc`: data imputed with the "QRILC" method
# 4. `unfiltered_count`: raw (non-log) intensities after the manual cleaning steps
# Extract the assay matrix from data_filt_norm (normalized, un-imputed)
normalized_unimputed <- as.data.frame(assay(data_filt_norm))
colnames(normalized_unimputed) <- colnames(unfiltered_count)
rownames(normalized_unimputed) <- rownames(data_filt_norm)
# Extract the assay matrix from data_imp_MinProb (MinProb-imputed)
imputed_data_MinProb <- as.data.frame(assay(data_imp_MinProb))
colnames(imputed_data_MinProb) <- colnames(unfiltered_count)
rownames(imputed_data_MinProb) <- rownames(data_filt_norm)
# Extract the assay matrix from data_imp_qrilc (QRILC-imputed)
imputed_data_qrilc <- as.data.frame(assay(data_imp_qrilc))
colnames(imputed_data_qrilc) <- colnames(unfiltered_count)
rownames(imputed_data_qrilc) <- rownames(data_filt_norm)
# Side-by-side intensity histograms for all four datasets
par(mfrow = c(1, 4))
hist(as.vector(as.matrix(normalized_unimputed)), breaks = 50, main = "Normalized Data", xlab = "Intensity")
hist(as.vector(as.matrix(imputed_data_MinProb)), breaks = 50, main = "MinProb Imputed Data", xlab = "Intensity")
hist(as.vector(as.matrix(imputed_data_qrilc)), breaks = 50, main = "QRILC Imputed Data", xlab = "Intensity")
# NOTE(review): this panel is titled "Filtered Data" but plots
# `unfiltered_count` (raw, non-log scale) -- confirm which was intended
hist(as.vector(as.matrix(unfiltered_count)), breaks = 50, main = "Filtered Data", xlab = "Intensity")
# Output folder for saved matrices (relative to this script's location)
output_folder <- "../../../../Terminal/analysis/01_dge/output/"
# Persist a matrix as a TSV file, carrying the row names in a trailing
# "GeneName" column (fwrite() itself discards row names).
save_matrix <- function(matrix, file_path) {
  out <- as.data.frame(matrix)
  out$GeneName <- rownames(matrix)
  fwrite(out, file = file_path, sep = "\t", row.names = FALSE)
}
# Read a TSV written by save_matrix(), restoring the "GeneName" column as row
# names and returning a plain numeric matrix.
load_matrix <- function(file_path) {
  df <- fread(file_path, data.table = FALSE)
  gene_names <- df$GeneName
  df$GeneName <- NULL
  mat <- as.matrix(df)
  rownames(mat) <- gene_names
  mat
}
# # Save the matrices (commented out to avoid overwriting existing output)
# save_matrix(normalized_unimputed, paste0(output_folder, "normalized_unimputed.tsv"))
# save_matrix(imputed_data_MinProb, paste0(output_folder, "imputed_data_MinProb.tsv"))
# save_matrix(imputed_data_qrilc, paste0(output_folder, "imputed_data_qrilc.tsv"))
# Reload the previously saved matrices.
# NOTE(review): this OVERWRITES the in-memory objects computed above with the
# versions on disk; results will silently diverge if the saved files are stale
normalized_unimputed <- load_matrix(paste0(output_folder, "normalized_unimputed.tsv"))
imputed_data_MinProb <- load_matrix(paste0(output_folder, "imputed_data_MinProb.tsv"))
imputed_data_qrilc <- load_matrix(paste0(output_folder, "imputed_data_qrilc.tsv"))
# Define the mapping from old names to new names
# rename_mapping <- c(
# "1B" = "M1_RA", "1C" = "M1_LA",
# "2B" = "M2_RA", "2C" = "M2_LA",
# "3B" = "M3_RA", "3C" = "M3_LA",
# "4B" = "M4_RA", "4C" = "M4_LA",
# "5B" = "M5_RA", "5C" = "M5_LA",
# "6B" = "M6_RA", "6C" = "M6_LA",
# "7B" = "M7_RA", "7C" = "M7_LA",
# "8B" = "M8_RA", "8C" = "M8_LA",
# "9B" = "M9_RA", "9C" = "M9_LA",
# "10B" = "M10_RA", "10C" = "M10_LA",
# "11B" = "M11_RA", "11C" = "M11_LA",
# "12B" = "M12_RA", "12C" = "M12_LA",
# "13B" = "M13_RA", "13C" = "M13_LA",
# "14B" = "M14_RA", "14C" = "M14_LA",
# "15B" = "M15_RA", "15C" = "M15_LA",
# "16B" = "M16_RA", "16C" = "M16_LA",
# "17B" = "M17_RA", "17C" = "M17_LA",
# "18B" = "M18_RA", "18C" = "M18_LA",
# "19B" = "M19_RA", "19C" = "M19_LA",
# "20B" = "M20_RA", "20C" = "M20_LA",
# "22B" = "M22_RA", "22C" = "M22_LA",
# "23B" = "M23_RA", "23C" = "M23_LA",
# "24B" = "M24_RA", "24C" = "M24_LA",
# "25B" = "M25_RA"
# )
#
#
# # # Rename the columns using the mapping
# norm_proteomics <- imputed_data_MinProb %>% rename_with(~ rename_mapping[.x], everything())
# norm_proteomics <- as.data.frame(norm_proteomics)
#
# #Save the Normalized Proteomics Data for Integration
# # Save the renamed and imputed proteomics data as a text file for further integration with other omics data.
# # Note: The line below is commented out to avoid overwriting files unintentionally.
# write.table(norm_proteomics, file="../../../../../Proteomics/Terminal/analysis/01_dge/output/proteomics_data.txt", sep="\t", col.names=NA, quote=FALSE)
#
# # Step 4: Save the QRILC Imputed Data as an RDS Object for Further Analysis
# # Save the `data_imp_qrilc` object as an RDS file for use in downstream applications.
# # This is useful for restoring the exact imputed data structure later without re-running the imputation process.
# saveRDS(data_imp_qrilc, file = "../../../../../Proteomics/Terminal/analysis/01_dge/output/data_imp_MinProb.rds")
# #Set the design
# design <- model.matrix(~0 + Condition + Batch, d$samples)
# colnames(design) <- gsub("Condition", "", colnames(design))
#
# con <- makeContrasts(AF_vs_Sham_RA = RA_Control_4months - RA_Sham_4months,
# AF_vs_Sham_LA = LA_Control_4months - LA_Sham_4months,
# Metformin_vs_AF_RA = RA_Metformin_4months - RA_Control_4months,
# Metformin_vs_AF_LA = LA_Metformin_4months - LA_Control_4months,
# RA_vs_LA_Sham = RA_Sham_4months - LA_Sham_4months,
# RA_vs_LA_AF = RA_Control_4months - LA_Control_4months,
# RA_vs_LA_Metformin = RA_Metformin_4months - LA_Metformin_4months,
# InteractionEffect = (RA_Sham_4months - RA_Control_4months) - (LA_Sham_4months - LA_Control_4months),
# AverageDiseaseEffect = (RA_Control_4months + LA_Control_4months)/2 - (LA_Sham_4months + RA_Sham_4months)/2,
# AverageTreatmentEffect = (RA_Metformin_4months + LA_Metformin_4months)/2 - (LA_Control_4months + RA_Control_4months)/2,
# AverageRegionEffect = (RA_Sham_4months + RA_Control_4months)/2 - (LA_Sham_4months + LA_Control_4months)/2,
# levels = design)
# con
#
#
# #Run limma whilst blocking for "horse"/Estimate Correlation due to Repeated Measures (Blocking for "Horse")
# # Correlation within horses is accounted for using `duplicateCorrelation`.
# corfit <- duplicateCorrelation(imputed_data_qrilc, design, block = as.factor(d$samples$Horse))
# fit <- lmFit(imputed_data_qrilc, design, block = as.factor(d$samples$Horse), correlation = corfit$consensus.correlation)
# rownames(fit$coefficients) <- rownames(imputed_data_qrilc) # Retain row names for downstream mapping
#
# # Obtain DGE results with FDR correction using fdrtool
# res <- list() # list for DGE results
# for (i in colnames(con)) {
# fit.contrast <- contrasts.fit(fit, contrasts = con)
# fit.contrast <- eBayes(fit.contrast, robust = TRUE, trend = TRUE) # Fit contrasts and apply Empirical Bayes moderation
# res_tmp <- topTable(fit.contrast, coef = i, adjust.method = "BH", number = Inf)
# res_tmp <- res_tmp[!is.na(res_tmp$t), ] # Remove NA values
#
# # Apply FDR correction using fdrtool
# fdr_res <- fdrtool(res_tmp$t, plot = FALSE, verbose = FALSE)
# res_tmp$qval <- fdr_res$qval # FDR-corrected q-values
# res_tmp$lfdr <- fdr_res$lfdr # Local FDR values
# res_tmp$Contrast <- rep(i, nrow(res_tmp)) # Store contrast identifier
# res[[i]] <- data.frame(res_tmp) # Append results to the list
#
# ## We use the fdrtool package for FDR correction to align with the DEP package and ensure consistency in our proteomics analysis workflow. This method models the distribution of test statistics directly, providing both q-values and local false discovery rates (lfdr) for a more refined control of type I error.
#
# # Print the number of differentially abundant genes
# n_qval <- res_tmp %>% filter(qval < 0.05) %>% nrow()
# n_adj_pval <- res_tmp %>% filter(adj.P.Val < 0.05) %>% nrow()
# print(paste('Number of differentially abundant genes for', i, 'based on q-value (FDR) =', n_qval))
# print(paste('Number of differentially abundant genes for', i, 'based on adjusted p-value =', n_adj_pval))
# }
# res_all <- do.call(rbind, res) # Combine results across all contrasts
#
# # Map Gene Names Manually
# res_all$GeneName <- sapply(seq_len(nrow(res_all)), function(i) {
# gsub(paste0("^", res_all$Contrast[i], "\\."), "", rownames(res_all)[i])
# })
# res_split <- split(res_all, res_all$Contrast) # Split the combined results based on contrast for easier output
#
# # Create output Excel file
# # Commented out
# # openxlsx::write.xlsx(x = res_split, file = "../../../../Terminal/analysis/01_dge/output/dge_results.xlsx", asTable = TRUE)
#
# # Create output TSV file
# # Commented out
# # data.table::fwrite(x = res_all, file = "../../../../Terminal/analysis/01_dge/output/dge_results.tsv.gz", sep = "\t")
# TODO: Ali when I use Combat and SVA my pvalue histograms gets much nicer, despite the fact that the batch-effect ("season") and horseIDs are nested, and the corfit is almost 0.
# TODO: Ali's response >>>
# 1) You might be over-sanding your data with all batch adjustment procedures. ComBat correct for HorseIDs, SVA corrects for unknown systematic variations.
# 2) Modelling the batch effect is always a way better option than removing it. I avoid using ComBat in majority of the analysis I perform.
# Regarding the points above, please read: https://academic.oup.com/biostatistics/article/17/1/29/1744261
# VERDICT: don't use ComBat
# Extract batch and biological condition information
batch <- d$samples$Batch # Specify the batch variable
condition <- d$samples$Condition # Specify the biological condition
# Run ComBat to correct for batch effects
# NOTE(review): the VERDICT above says "don't use ComBat"; this branch still
# runs it so its results can be compared against the SVA-only workflow below.
# ComBat removes the `batch` effect while protecting the biological signal in
# `condition` via the model matrix passed as `mod`.
combat_data <- ComBat(dat = imputed_data_MinProb, batch = batch, mod = model.matrix(~ condition))
## Found 4 batches
## Adjusting for 5 covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
# Check if the batch effect is removed using PCA or similar methods
# PCA to visualize the data before and after batch correction (optional)
# You can run a PCA analysis here to check the effectiveness of ComBat
# Proceed with SVA to adjust for hidden confounders
# NOTE(review): the advice later in this file recommends an intercept model
# (~ condition) for SVA, but a cell-means model (~ 0 + condition) is used
# here — confirm this is intended for this ComBat+SVA branch.
mod <- model.matrix(~ 0 + condition, data = d$samples) # Full model
mod0 <- model.matrix(~ 1, data = d$samples) # Null model (intercept only)
n.sv <- num.sv(combat_data, mod, method = "be") # Estimate number of surrogate variables
svobj <- sva(combat_data, mod, mod0, n.sv = n.sv)
## Number of significant surrogate variables is: 7
## Iteration (out of 5 ):1 2 3 4 5
# Generate pairwise scatter plots of surrogate variables, coloured by Group,
# to check visually that no SV captures the biological signal.
sv_plots <- list() # Store plots
sv_data <- as.data.frame(svobj$sv)
colnames(sv_data) <- paste0("SV", seq_len(ncol(svobj$sv)))
sv_data <- cbind(d$samples, sv_data)
sv_cols <- colnames(sv_data)[grep("^SV", colnames(sv_data))]
for (i in seq_along(sv_cols)) {
  for (j in seq_along(sv_cols)) {
    if (i < j) { # Plot each unordered pair only once
      # `aes_string()` is deprecated since ggplot2 3.0.0 (see the warning
      # recorded below); map the string column names via tidy evaluation
      # with `.data[[...]]` instead.
      sv_plots[[paste0("SV", i, "_SV", j)]] <- ggplot(
        sv_data,
        aes(x = .data[[sv_cols[i]]], y = .data[[sv_cols[j]]], color = Group)
      ) +
        geom_point(size = 3, alpha = 0.8) +
        theme_minimal() +
        labs(title = paste("Surrogate Variables:", sv_cols[i], "vs", sv_cols[j]),
             x = paste("Surrogate Variable", i),
             y = paste("Surrogate Variable", j))
    }
  }
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## i Please use tidy evaluation idioms with `aes()`.
## i See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Render every stored surrogate-variable scatter plot for manual inspection.
invisible(lapply(sv_plots, print))
cat("None of the surrogate variables captured variability related to disease group.\n")
## None of the surrogate variables captured variability related to disease group.
# Adjust the design matrix to include surrogate variables.
# Give the SV columns real names *before* model fitting: cbind()-ing the
# unnamed columns of svobj$sv and running make.names() only after lmFit()
# produced duplicated "X" column names and made contrasts.fit() warn that
# the row names of the contrasts do not match the coefficient names.
sv_mat <- svobj$sv
colnames(sv_mat) <- paste0("SV", seq_len(ncol(sv_mat)))
modSv <- cbind(mod, sv_mat)
# Ensure all column names in modSv are syntactically valid
colnames(modSv) <- make.names(colnames(modSv))
# Run duplicateCorrelation to account for repeated measures (e.g., "Horse" as a blocking factor)
corfit <- duplicateCorrelation(combat_data, modSv, block = as.factor(d$samples$Horse))
# Fit the linear model using limma, accounting for repeated measures
fit <- lmFit(combat_data, modSv, block = as.factor(d$samples$Horse), correlation = corfit$consensus.correlation)
cat("Consensus correlation for repeated measures:", corfit$consensus.correlation, "\n") # 0.0025
## Consensus correlation for repeated measures: 0.002543767
# Check the updated column names to use in the makeContrasts function
# (SV columns are now SV1..SVn rather than the duplicated "X" names.)
print(colnames(modSv))
# Define contrasts with the updated column names.
# Within-atrium disease (Control vs Sham) and treatment (Metformin vs Control)
# effects, right-vs-left atrium comparisons, the disease x atrium interaction,
# and "Average*" effects pooled across both atria by averaging cell means.
# NOTE(review): per the contrast names, "Control" corresponds to the AF model
# and "Sham" to the surgical control — confirm against the study design.
con <- makeContrasts(
AF_vs_Sham_RA = conditionRA_Control_4months - conditionRA_Sham_4months,
AF_vs_Sham_LA = conditionLA_Control_4months - conditionLA_Sham_4months,
Metformin_vs_AF_RA = conditionRA_Metformin_4months - conditionRA_Control_4months,
Metformin_vs_AF_LA = conditionLA_Metformin_4months - conditionLA_Control_4months,
RA_vs_LA_Sham = conditionRA_Sham_4months - conditionLA_Sham_4months,
RA_vs_LA_AF = conditionRA_Control_4months - conditionLA_Control_4months,
RA_vs_LA_Metformin = conditionRA_Metformin_4months - conditionLA_Metformin_4months,
InteractionEffect = (conditionRA_Sham_4months - conditionRA_Control_4months) - (conditionLA_Sham_4months - conditionLA_Control_4months),
AverageDiseaseEffect = (conditionRA_Control_4months + conditionLA_Control_4months)/2 - (conditionRA_Sham_4months + conditionLA_Sham_4months)/2,
AverageTreatmentEffect = (conditionRA_Metformin_4months + conditionLA_Metformin_4months)/2 - (conditionRA_Control_4months + conditionLA_Control_4months)/2,
AverageRegionEffect = (conditionRA_Sham_4months + conditionRA_Control_4months)/2 - (conditionLA_Sham_4months + conditionLA_Control_4months)/2,
levels = modSv
)
# Apply contrasts and run eBayes
fit <- contrasts.fit(fit, con)
## Warning in contrasts.fit(fit, con): row names of contrasts don't match col
## names of coefficients
# NOTE(review): the warning occurs because make.names() renamed the design
# columns *after* lmFit(), so the fit's coefficient names differ from the
# contrast row names and limma falls back to positional matching — verify
# the condition columns are in identical order in fit and con.
fit <- eBayes(fit, robust = TRUE, trend = TRUE)
# Extract per-contrast DGE tables with Benjamini-Hochberg FDR correction.
res <- list()
for (contrast_id in colnames(con)) {
  tab <- topTable(fit, coef = contrast_id, adjust.method = "BH", number = Inf)
  tab <- tab[!is.na(tab$t), ]            # drop rows with an undefined t-statistic
  tab$Contrast <- rep(contrast_id, nrow(tab))
  res[[contrast_id]] <- tab
  # Report how many features pass the 5% BH-adjusted threshold
  n_adj_pval <- nrow(tab[tab$adj.P.Val < 0.05, ])
  print(paste('Number of differentially expressed genes for', contrast_id, 'based on adjusted p-value (BH) =', n_adj_pval))
}
## [1] "Number of differentially expressed genes for AF_vs_Sham_RA based on adjusted p-value (BH) = 38"
## [1] "Number of differentially expressed genes for AF_vs_Sham_LA based on adjusted p-value (BH) = 98"
## [1] "Number of differentially expressed genes for Metformin_vs_AF_RA based on adjusted p-value (BH) = 0"
## [1] "Number of differentially expressed genes for Metformin_vs_AF_LA based on adjusted p-value (BH) = 3"
## [1] "Number of differentially expressed genes for RA_vs_LA_Sham based on adjusted p-value (BH) = 25"
## [1] "Number of differentially expressed genes for RA_vs_LA_AF based on adjusted p-value (BH) = 279"
## [1] "Number of differentially expressed genes for RA_vs_LA_Metformin based on adjusted p-value (BH) = 88"
## [1] "Number of differentially expressed genes for InteractionEffect based on adjusted p-value (BH) = 0"
## [1] "Number of differentially expressed genes for AverageDiseaseEffect based on adjusted p-value (BH) = 181"
## [1] "Number of differentially expressed genes for AverageTreatmentEffect based on adjusted p-value (BH) = 4"
## [1] "Number of differentially expressed genes for AverageRegionEffect based on adjusted p-value (BH) = 210"
# Combine all per-contrast results into one data frame; rbind() prefixes each
# row name with the list element (contrast) name, e.g. "AF_vs_Sham_RA.GENE".
res_all <- do.call(rbind, res)
# Recover the gene identifier by stripping the "<Contrast>." prefix from each
# row name. stringr::str_remove() (stringr is loaded at the top of the file)
# is vectorised over both string and pattern, so this replaces the per-row
# sapply() closure with a single vectorised call; the anchored pattern removes
# at most one match, exactly as the original gsub() did.
res_all$GeneName <- stringr::str_remove(
  rownames(res_all),
  paste0("^", res_all$Contrast, "\\.")
)
# Split results by contrast for easier output
res_split <- split(res_all, res_all$Contrast)
# Optionally, save the results to files
# openxlsx::write.xlsx(x = res_split, file = "../../../../Terminal/analysis/01_dge/output/dge_results_sva_combat.xlsx", asTable = TRUE)
# data.table::fwrite(x = res_all, file = "../../../../Terminal/analysis/01_dge/output/dge_results_sva.tsv.gz", sep = "\t")
# Loop through each data frame in res_split to create p-value histograms
for (contrast in names(res_split)) {
  # Extract the P.Values for the current contrast
  p_values <- res_split[[contrast]]$P.Value
  # Build the histogram. Use coord_cartesian() to zoom rather than xlim():
  # xlim() *removes* bins that touch the axis limits — the source of the
  # repeated "Removed 2 rows containing missing values" warnings recorded
  # below — silently hiding the boundary bars of each histogram.
  hist_plot <- ggplot(data = data.frame(P.Value = p_values), aes(x = P.Value)) +
    geom_histogram(bins = 50, color = "black", fill = "lightblue") +
    theme_minimal() +
    labs(
      title = paste("P-value Histogram for", contrast),
      x = "P-value",
      y = "Frequency"
    ) +
    coord_cartesian(xlim = c(0, 1)) # Zoom without dropping data
  # Print the named plot explicitly; last_plot() is fragile inside loops
  print(hist_plot)
}
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
# Verdict: Nice p-value histograms
# In this workflow, SVA is used to remove hidden variance, and a blocking factor
# accounts for variance introduced by repeated measures (e.g., two samples from the same horse).
# Extract biological condition information
condition <- d$samples$Condition # Specify the biological condition
# Apply SVA to Adjust for Hidden Confounders
# TODO: commenting out line below, because it is wrong. See better "mod" below
#mod <- model.matrix(~ condition + batch, data = d$samples)
# TODO: Ali, if I don't say 0+ i get an intercept...
# TODO: Ali's response >>>
# "0 +" literally tells the model to exclude the intercept. It is easier to write contrasts if there is no intercept in the model.
# Otherwise, everything will be relative to intercept and you need to be more careful with contrasts
# For SVA analysis, always use the model with intercept (no "~ 0 + condition") as SVA was made to work with designs including intercept (e.g.: ~ condition). However, you are welcome to use designs with/out in your lmFit.
# TODO: Things to avoid:
# 1) don't provide integer values as your "batch". It should be "factor"
# 2) remember, your "horseID" is nested under "batch". If you use both in your analysis, you are over-correcting your data.
#
# Full model including condition (intercept model, as recommended above for SVA)
mod <- model.matrix(~ condition, data = d$samples)
# Null model excluding biological information (intercept only)
mod0 <- model.matrix(~ 1, data = d$samples)
# TODO: Model for your analysis
# Cell-means design (no intercept) used below for lmFit/makeContrasts
design <- model.matrix(~ 0 + condition, data = d$samples)
# Estimate the number of surrogate variables
n.sv <- num.sv(imputed_data_MinProb, mod, method = "leek") #TODO: changed to "leek"
cat("Number of surrogate variables estimated:", n.sv, "\n")
## Number of surrogate variables estimated: 3
# Run SVA to estimate surrogate variables
svobj <- sva(imputed_data_MinProb, mod, mod0, n.sv = n.sv)
## Number of significant surrogate variables is: 3
## Iteration (out of 5 ):1 2 3 4 5
#TODO: in case that you may wanna try WSVA
# limma::wsva() (weighted SVA) — called here for comparison only; the returned
# SV matrix is printed (below) but not assigned or used downstream.
wsva(y = imputed_data_MinProb, design = design, plot = TRUE, n.sv = 3) #TODO: seems like you have 3 SVs here as well
## SV1 SV2 SV3
## 1B 0.3573535 1.1474525 -1.0522788
## 1C 1.3127950 0.7770749 -1.1625313
## 2B 0.5791452 0.5842395 -0.8958858
## 2C 1.5616750 1.0536587 -0.8207492
## 3B 0.5127452 1.3437503 -1.0360785
## 3C 1.6863279 1.3876892 -1.1366714
## 4B 1.1670473 0.2059838 -0.9372754
## 4C 0.7662689 1.1421322 -1.0992342
## 5B 0.2923547 1.1207862 -1.0994177
## 5C 0.6552365 1.1462547 -0.9414663
## 6B 1.1917682 0.3205883 -0.8949086
## 6C 1.2615031 0.8226331 -1.1337995
## 7B 0.9758074 1.0727959 -1.1282049
## 7C 1.6607710 1.1388520 -0.7977166
## 8B 0.5029769 1.0464044 -1.1183248
## 8C 1.0848284 1.5633195 -0.9422505
## 9B 0.7874354 0.4911280 -0.9356350
## 9C 0.9527177 0.6702592 -0.8201786
## 10B 0.9237626 1.2330104 -1.0979207
## 10C 1.6003251 1.2751311 -1.0358522
## 11B 1.3387899 0.2727189 -0.9036288
## 11C 1.4013273 1.2088147 -0.7546654
## 12B 0.2251933 1.1445020 -1.0157421
## 12C 1.2957909 1.0067182 -1.1569569
## 13B 0.2077571 0.9542325 -0.8599435
## 13C 1.3169174 1.5663913 -0.7872510
## 14B 0.7020483 1.1697629 -1.0968155
## 14C 0.9207996 1.4003561 -0.7980519
## 15B 0.2783642 1.0353342 -0.8895908
## 15C 1.3702209 1.2016177 -1.0724129
## 16B 0.5806434 0.9095181 -1.1583079
## 16C 1.6348403 1.1673293 -0.8041235
## 17B 0.2103786 1.0531883 -0.8636527
## 17C 0.9349427 0.1989457 -1.2040543
## 18B 0.6049955 0.9151372 -0.9391372
## 18C 0.9684483 0.6228160 -0.8733031
## 19B 0.6404562 1.2122941 -1.0835777
## 19C 0.6417960 0.6930854 -1.1340618
## 20B 0.9155340 0.9318626 -0.9706570
## 20C -0.2151408 0.8952029 -0.8108819
## 22B 1.0479556 0.8090472 -1.1727357
## 22C 0.5472921 0.5907384 -1.1142357
## 23B 0.8763166 0.7033742 -0.9187375
## 23C 0.9109478 0.4650324 -0.8789554
## 24B 1.0122172 0.9964258 -1.1824583
## 24C 0.6136991 0.9495684 -1.1228046
## 25B 1.3474387 0.7179762 -0.9376727
# Visualize Surrogate Variables to Ensure No Biological Information is Captured
# Prepare SV data for plotting: one column per surrogate variable, named
# SV1..SVn, bound to the sample metadata for colouring/shaping the plots.
sv_frame <- as.data.frame(svobj$sv)
names(sv_frame) <- paste0("SV", seq_len(ncol(sv_frame)))
sv_data <- cbind(d$samples, sv_frame)
# Container for the pairwise scatter plots
sv_plots <- list()
sv_cols <- grep("^SV", colnames(sv_data), value = TRUE)
#TODO: you are doing this too late, why don't you modify it as soon as you read the metadata?
# Convert Batch to a factor so ggplot maps it to a discrete shape scale
sv_data$Batch <- as.factor(sv_data$Batch)
# Loop to generate pairwise scatter plots for surrogate variables,
# coloured by Group and shaped by Batch.
for (i in seq_along(sv_cols)) {
  for (j in seq_along(sv_cols)) {
    if (i < j) { # Plot each unordered pair only once
      # `aes_string()` is deprecated since ggplot2 3.0.0; map the string
      # column names via tidy evaluation with `.data[[...]]` instead.
      sv_plots[[paste0("SV", i, "_SV", j)]] <- ggplot(
        sv_data,
        aes(x = .data[[sv_cols[i]]], y = .data[[sv_cols[j]]],
            color = Group, shape = Batch)
      ) +
        geom_point(size = 3, alpha = 0.8) +
        theme_minimal() +
        labs(title = paste("Surrogate Variables:", sv_cols[i], "vs", sv_cols[j]),
             x = paste("Surrogate Variable", i),
             y = paste("Surrogate Variable", j))
    }
  }
}
# Render all surrogate-variable scatter plots for manual inspection.
invisible(lapply(sv_plots, print))
cat("None of the surrogate variables captured variability related to disease group.\n")
## None of the surrogate variables captured variability related to disease group.
#TODO: Note the new modSV
# Incorporate Surrogate Variables into the Design Matrix
#modSv <- cbind(mod, svobj$sv)
# Name the SV columns *before* fitting: the unnamed columns of svobj$sv became
# duplicated "X" names after make.names(), which is what made contrasts.fit()
# warn that contrast row names do not match the coefficient names.
sv_block <- svobj$sv
colnames(sv_block) <- paste0("SV", seq_len(ncol(sv_block)))
modSv <- cbind(design, sv_block)
# Ensure column names in design matrix are syntactically valid
colnames(modSv) <- make.names(colnames(modSv))
# Run duplicateCorrelation to estimate correlation between repeated samples (blocking by horse)
corfit <- duplicateCorrelation(imputed_data_MinProb, modSv, block = as.factor(d$samples$Horse))
cat("Consensus correlation for repeated measures:", corfit$consensus.correlation, "\n")
## Consensus correlation for repeated measures: 0.05900468
# Fit the linear model with limma, accounting for repeated measures
fit <- lmFit(imputed_data_MinProb, modSv, block = as.factor(d$samples$Horse), correlation = corfit$consensus.correlation)
cat("Consensus correlation for repeated measures:", corfit$consensus.correlation, "\n") # 0.09
## Consensus correlation for repeated measures: 0.05900468
# Define contrasts with the updated column names.
# Same contrast set as the ComBat branch above: within-atrium disease and
# treatment effects, right-vs-left comparisons, the disease x atrium
# interaction, and averages pooled across both atria.
con <- makeContrasts(
AF_vs_Sham_RA = conditionRA_Control_4months - conditionRA_Sham_4months,
AF_vs_Sham_LA = conditionLA_Control_4months - conditionLA_Sham_4months,
Metformin_vs_AF_RA = conditionRA_Metformin_4months - conditionRA_Control_4months,
Metformin_vs_AF_LA = conditionLA_Metformin_4months - conditionLA_Control_4months,
RA_vs_LA_Sham = conditionRA_Sham_4months - conditionLA_Sham_4months,
RA_vs_LA_AF = conditionRA_Control_4months - conditionLA_Control_4months,
RA_vs_LA_Metformin = conditionRA_Metformin_4months - conditionLA_Metformin_4months,
InteractionEffect = (conditionRA_Sham_4months - conditionRA_Control_4months) - (conditionLA_Sham_4months - conditionLA_Control_4months),
AverageDiseaseEffect = (conditionRA_Control_4months + conditionLA_Control_4months)/2 - (conditionRA_Sham_4months + conditionLA_Sham_4months)/2,
AverageTreatmentEffect = (conditionRA_Metformin_4months + conditionLA_Metformin_4months)/2 - (conditionRA_Control_4months + conditionLA_Control_4months)/2,
AverageRegionEffect = (conditionRA_Sham_4months + conditionRA_Control_4months)/2 - (conditionLA_Sham_4months + conditionLA_Control_4months)/2,
levels = modSv
)
# Apply contrasts and run eBayes
fit <- contrasts.fit(fit, con)
## Warning in contrasts.fit(fit, con): row names of contrasts don't match col
## names of coefficients
# NOTE(review): as in the ComBat branch, make.names() renamed the design
# columns after lmFit(), so limma matches contrast rows to coefficients by
# position — verify the condition columns are in identical order.
fit <- eBayes(fit, robust = TRUE, trend = TRUE)
# Extract per-contrast DGE tables with Benjamini-Hochberg FDR correction.
res <- list()
for (contrast_id in colnames(con)) {
  tab <- topTable(fit, coef = contrast_id, adjust.method = "BH", number = Inf)
  tab <- tab[!is.na(tab$t), ]  # drop rows with an undefined t-statistic
  tab$Contrast <- contrast_id  # scalar is recycled across all rows
  res[[contrast_id]] <- tab
  # Report how many features pass the 5% BH-adjusted threshold
  n_adj_pval <- nrow(tab[tab$adj.P.Val < 0.05, ])
  print(paste('Number of differentially expressed genes for', contrast_id, 'based on adjusted p-value (BH) =', n_adj_pval))
}
## [1] "Number of differentially expressed genes for AF_vs_Sham_RA based on adjusted p-value (BH) = 4"
## [1] "Number of differentially expressed genes for AF_vs_Sham_LA based on adjusted p-value (BH) = 88"
## [1] "Number of differentially expressed genes for Metformin_vs_AF_RA based on adjusted p-value (BH) = 0"
## [1] "Number of differentially expressed genes for Metformin_vs_AF_LA based on adjusted p-value (BH) = 3"
## [1] "Number of differentially expressed genes for RA_vs_LA_Sham based on adjusted p-value (BH) = 5"
## [1] "Number of differentially expressed genes for RA_vs_LA_AF based on adjusted p-value (BH) = 452"
## [1] "Number of differentially expressed genes for RA_vs_LA_Metformin based on adjusted p-value (BH) = 50"
## [1] "Number of differentially expressed genes for InteractionEffect based on adjusted p-value (BH) = 0"
## [1] "Number of differentially expressed genes for AverageDiseaseEffect based on adjusted p-value (BH) = 65"
## [1] "Number of differentially expressed genes for AverageTreatmentEffect based on adjusted p-value (BH) = 0"
## [1] "Number of differentially expressed genes for AverageRegionEffect based on adjusted p-value (BH) = 217"
# Combine all per-contrast results into one data frame; rbind() prefixes each
# row name with the list element (contrast) name, e.g. "AF_vs_Sham_RA.GENE".
res_all <- do.call(rbind, res)
# Strip the "<Contrast>." prefix from each row name to recover the gene ID.
# stringr::str_remove() (stringr is loaded at the top of the file) is
# vectorised over string and pattern, replacing the per-row sapply() loop
# with one vectorised call; the anchored pattern removes at most one match,
# exactly as the original gsub() did.
res_all$GeneName <- stringr::str_remove(
  rownames(res_all),
  paste0("^", res_all$Contrast, "\\.")
)
# Split results by contrast for easier output
res_split <- split(res_all, res_all$Contrast)
# Optionally, save the results to files
# openxlsx::write.xlsx(x = res_split, file = "../../../../Timecourse/analysis/01_dge/output/dge_results_sva_combat.xlsx", asTable = TRUE)
# data.table::fwrite(x = res_all, file = "../../../../Timecourse/analysis/01_dge/output/dge_results_sva.tsv.gz", sep = "\t")
# Visualize Results with P-value Histograms
for (contrast in names(res_split)) {
  pvals <- res_split[[contrast]]$P.Value
  # Fixed 0.05-wide bins spanning [0, 1]
  hist_plot <- ggplot(data = data.frame(P.Value = pvals), aes(x = P.Value)) +
    geom_histogram(breaks = seq(0, 1, by = 0.05), color = "black", fill = "lightblue") +
    theme_minimal() +
    labs(title = paste("P-value Histogram for", contrast),
         x = "P-value",
         y = "Frequency") +
    xlim(0, 1) # Set x-axis limits
  print(hist_plot)
}
# Quick volcano plots, one per contrast, via aamisc::ggVolcano.
# Points are flagged at adj.P.Val <= 0.05 with no fold-change cutoff (logFC = 0).
volcano_plots <- list()
for (i in names(res)){
volcano_plots[[i]] <- ggVolcano(x = res[[i]],
fdr = 0.05,
fdr.column = "adj.P.Val",
pvalue.column = "P.Value",
logFC = 0,
logFC.column = "logFC",
text.size = 2) +
theme_bw(base_size = 10) +
ggtitle(i)
}
## Warning in max(x[get(fdr.column) <= .fdr][, get(pvalue.column)]): no
## non-missing arguments to max; returning -Inf
## Warning in max(x[get(fdr.column) <= .fdr][, get(pvalue.column)]): no
## non-missing arguments to max; returning -Inf
## Warning in max(x[get(fdr.column) <= .fdr][, get(pvalue.column)]): no
## non-missing arguments to max; returning -Inf
# NOTE(review): the three warnings above presumably come from the contrasts
# with zero features below the FDR cutoff (max over an empty set) — confirm
# they are harmless.
# Arrange all volcano plots in a 3-column grid
patchwork::wrap_plots(volcano_plots, ncol = 3)
library(ggrepel)
# Create a named vector for ENSEMBL to Genes mapping
# (lookup table: names = ENSEMBL IDs, values = gene symbols)
ensembl_to_Genes <- setNames(annot_reordered$Genes, annot_reordered$ENSEMBL)
# Map ENSEMBL IDs to Gene Names in the `res` list
for (contrast_name in names(res)) {
# Ensure the dataframe has ENSEMBL IDs as rownames
# NOTE(review): assumes the rownames are ENSEMBL IDs at this point — confirm
if (!"ENSEMBL" %in% colnames(res[[contrast_name]])) {
res[[contrast_name]]$ENSEMBL <- rownames(res[[contrast_name]])
}
# Map Gene Names using the annotation (named-vector lookup; unmatched IDs yield NA)
res[[contrast_name]]$Genes <- ensembl_to_Genes[res[[contrast_name]]$ENSEMBL]
# Replace NA values in Genes with ENSEMBL IDs (to ensure plotting works even if some gene names are missing)
res[[contrast_name]]$Genes[is.na(res[[contrast_name]]$Genes)] <- res[[contrast_name]]$ENSEMBL[is.na(res[[contrast_name]]$Genes)]
}
# Load the volcano plot helper function
source("volcano_helpers.R")
# Create lists for volcano plots with and without labels
volcano_plots_no_labels <- list()
volcano_plots_with_labels <- list()
# Iterate over each contrast and create volcano plots
for (contrast_name in names(res)) {
# Ensure the Genes column is present for labeling
# (fallback: derive a label by stripping everything up to the last "_" in the rowname)
if (!"Genes" %in% colnames(res[[contrast_name]])) {
res[[contrast_name]]$Genes <- sapply(rownames(res[[contrast_name]]), function(x) gsub(".*_", "", x))
}
# Generate volcano plots using the helper function
# NOTE(review): create_custom_volcano_plot() is defined in volcano_helpers.R
# (not visible here); its contract — returning a list with $No_Labels and
# $With_Labels and saving plots to output_path — is assumed from usage below.
volcano_plots <- create_custom_volcano_plot(
df = res[[contrast_name]],
logFC_col = "logFC",
pvalue_col = "P.Value",
adj_pvalue_col = "adj.P.Val",
contrast_name = contrast_name,
fc_cutoff = 0, # Set fold-change cutoff for significance
pvalue_cutoff = 0.05, # Set p-value cutoff
save_plot = TRUE,
output_path = "../output/", # Adjust output path if needed
show_labels = TRUE # Generate both labeled and unlabeled plots
)
# Store the plots
volcano_plots_no_labels[[contrast_name]] <- volcano_plots$No_Labels
volcano_plots_with_labels[[contrast_name]] <- volcano_plots$With_Labels
}
## Warning: ggrepel: 12 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 381 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 15 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 166 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
# Combine and display volcano plots without labels
combined_volcano_no_labels <- patchwork::wrap_plots(volcano_plots_no_labels, ncol = 3)
# Combine and display volcano plots with labels
combined_volcano_with_labels <- patchwork::wrap_plots(volcano_plots_with_labels, ncol = 3)
# Display the combined volcano plots
print(combined_volcano_no_labels)
print(combined_volcano_with_labels)
## Warning: ggrepel: 88 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 452 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 48 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 65 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 216 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
# Print individual volcano plots with labels for key contrasts
print(volcano_plots_with_labels[["AF_vs_Sham_RA"]])
print(volcano_plots_with_labels[["AF_vs_Sham_LA"]])
## Warning: ggrepel: 13 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
print(volcano_plots_with_labels[["Metformin_vs_AF_RA"]])
print(volcano_plots_with_labels[["Metformin_vs_AF_LA"]])
# Record the full session (R version, platform, package versions) for reproducibility
sessionInfo()
## R version 4.4.2 (2024-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Sequoia 15.0.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
##
## locale:
## [1] C
##
## time zone: Europe/Copenhagen
## tzcode source: internal
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] ggrepel_0.9.6 sva_3.54.0
## [3] BiocParallel_1.39.0 genefilter_1.88.0
## [5] mgcv_1.9-1 nlme_3.1-166
## [7] aamisc_0.1.5 fdrtool_1.2.18
## [9] Metrics_0.1.4 SummarizedExperiment_1.36.0
## [11] Biobase_2.66.0 GenomicRanges_1.58.0
## [13] GenomeInfoDb_1.42.0 IRanges_2.40.0
## [15] S4Vectors_0.44.0 BiocGenerics_0.52.0
## [17] MatrixGenerics_1.18.0 DEP_1.28.0
## [19] preprocessCore_1.68.0 DEqMS_1.24.0
## [21] matrixStats_1.4.1 RColorBrewer_1.1-3
## [23] missForest_1.5 dplyr_1.1.4
## [25] openxlsx_4.2.7.1 patchwork_1.3.0
## [27] data.table_1.16.2 ggplot2_3.5.1
## [29] stringr_1.5.1 tibble_3.2.1
## [31] magrittr_2.0.3 biomaRt_2.62.0
## [33] readxl_1.4.3 readr_2.1.5
## [35] edgeR_4.4.0 limma_3.62.1
## [37] pacman_0.5.1
##
## loaded via a namespace (and not attached):
## [1] splines_4.4.2 later_1.3.2
## [3] norm_1.0-11.1 filelock_1.0.3
## [5] R.oo_1.27.0 cellranger_1.1.0
## [7] XML_3.99-0.17 lifecycle_1.0.4
## [9] httr2_1.0.6 doParallel_1.0.17
## [11] vroom_1.6.5 lattice_0.22-6
## [13] MASS_7.3-61 MultiAssayExperiment_1.32.0
## [15] sass_0.4.9 rmarkdown_2.29
## [17] jquerylib_0.1.4 yaml_2.3.10
## [19] httpuv_1.6.15 doRNG_1.8.6
## [21] zip_2.3.1 MsCoreUtils_1.18.0
## [23] DBI_1.2.3 abind_1.4-8
## [25] zlibbioc_1.52.0 R.utils_2.12.3
## [27] purrr_1.0.2 HarmonicRegression_1.0
## [29] AnnotationFilter_1.30.0 itertools_0.1-3
## [31] rappdirs_0.3.3 sandwich_3.1-1
## [33] circlize_0.4.16 GenomeInfoDbData_1.2.13
## [35] annotate_1.84.0 MSnbase_2.31.1
## [37] ncdf4_1.23 codetools_0.2-20
## [39] DelayedArray_0.32.0 DT_0.33
## [41] xml2_1.3.6 tidyselect_1.2.1
## [43] gmm_1.8 shape_1.4.6.1
## [45] UCSC.utils_1.2.0 farver_2.1.2
## [47] gmp_0.7-5 BiocFileCache_2.14.0
## [49] jsonlite_1.8.9 multtest_2.62.0
## [51] GetoptLong_1.0.5 survival_3.7-0
## [53] iterators_1.0.14 systemfonts_1.1.0
## [55] foreach_1.5.2 tools_4.4.2
## [57] progress_1.2.3 ragg_1.3.3
## [59] Rcpp_1.0.13-1 glue_1.8.0
## [61] gridExtra_2.3 SparseArray_1.6.0
## [63] xfun_0.49 qvalue_2.38.0
## [65] shinydashboard_0.7.2 withr_3.0.2
## [67] BiocManager_1.30.25 fastmap_1.2.0
## [69] fansi_1.0.6 digest_0.6.37
## [71] mime_0.12 R6_2.5.1
## [73] textshaping_0.4.0 imputeLCMD_2.1
## [75] colorspace_2.1-1 RSQLite_2.3.8
## [77] R.methodsS3_1.8.2 hexbin_1.28.4
## [79] utf8_1.2.4 tidyr_1.3.1
## [81] generics_0.1.3 prettyunits_1.2.0
## [83] PSMatch_1.10.0 httr_1.4.7
## [85] htmlwidgets_1.6.4 S4Arrays_1.6.0
## [87] pkgconfig_2.0.3 NISTunits_1.0.1
## [89] gtable_0.3.6 blob_1.2.4
## [91] ComplexHeatmap_2.22.0 impute_1.80.0
## [93] XVector_0.46.0 htmltools_0.5.8.1
## [95] rain_1.40.0 MALDIquant_1.22.3
## [97] ProtGenerics_1.38.0 clue_0.3-65
## [99] scales_1.3.0 tmvtnorm_1.6
## [101] png_0.1-8 knitr_1.49
## [103] rstudioapi_0.17.1 tzdb_0.4.0
## [105] reshape2_1.4.4 rjson_0.2.23
## [107] curl_6.0.0 cachem_1.1.0
## [109] zoo_1.8-12 GlobalOptions_0.1.2
## [111] parallel_4.4.2 AnnotationDbi_1.68.0
## [113] mzID_1.44.0 vsn_3.74.0
## [115] pillar_1.9.0 grid_4.4.2
## [117] vctrs_0.6.5 pcaMethods_1.98.0
## [119] promises_1.3.0 randomForest_4.7-1.2
## [121] dbplyr_2.5.0 xtable_1.8-4
## [123] cluster_2.1.6 evaluate_1.0.1
## [125] mvtnorm_1.3-2 cli_3.6.3
## [127] locfit_1.5-9.10 compiler_4.4.2
## [129] rlang_1.1.4 crayon_1.5.3
## [131] rngtools_1.5.2 labeling_0.4.3
## [133] QFeatures_1.16.0 affy_1.84.0
## [135] plyr_1.8.9 stringi_1.8.4
## [137] assertthat_0.2.1 munsell_0.5.1
## [139] Biostrings_2.74.0 lazyeval_0.2.2
## [141] Matrix_1.7-1 hms_1.1.3
## [143] bit64_4.5.2 KEGGREST_1.46.0
## [145] statmod_1.5.0 shiny_1.9.1
## [147] mzR_2.40.0 igraph_2.1.1
## [149] memoise_2.0.1 affyio_1.76.0
## [151] bslib_0.8.0 bit_4.5.0